from clustergrammer_widget import *
# Shared Network instance (Network and clustergrammer_widget both come from the
# star import above); the rest of the script loads data into it and renders it.
net = Network(clustergrammer_widget)
# DataFrames produced along the way, stored under short string keys
# ('ini', 'cat_sig', 'pred_cat', ...).
df = {}
import clustergrammer_groupby as cby
import gene_exp_10x
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt
%matplotlib inline
# Load the MNIST matrix, z-score it along the row axis, zero out NaNs, and keep
# the exported DataFrame as the starting point for everything below.
net.load_file('../data/mnist.txt')
net.normalize(axis='row', norm_type='zscore')
net.swap_nan_for_zero()
df['ini'] = net.export_df()
df['ini'].shape

# Colour used for each true-digit column category (cat_index=1), listed in the
# order the colours are registered.
digit_colors = (
    ('Digit: Zero', 'yellow'),
    ('Digit: One', 'red'),
    ('Digit: Two', 'blue'),
    ('Digit: Three', 'green'),
    ('Digit: Four', 'black'),
    ('Digit: Five', 'orange'),
    ('Digit: Six', 'purple'),
    ('Digit: Seven', 'grey'),
    ('Digit: Eight', 'white'),
    ('Digit: Nine', 'brown'),
)

net.load_df(df['ini'])
for digit_cat, digit_color in digit_colors:
    net.set_cat_color(axis='col', cat_index=1, cat_name=digit_cat,
                      inst_color=digit_color)

# Load the matrix again after the colours are set, then cluster and render it.
net.load_df(df['ini'])
net.cluster()
net.widget()
# Cutoff passed to the signature generator (reused for the shuffled-label
# comparison further down).
pval_cutoff = 0.001

# Build signatures from the 'Digit' column category of the initial matrix.
signature_outputs = cby.generate_signatures(
    df['ini'], 'Digit', pval_cutoff=pval_cutoff)
df['cat_sig'], keep_genes, keep_genes_dict = signature_outputs
print(df['cat_sig'].shape)

# Predict a category for every column from those signatures; the prediction is
# requested under the 'Predict Digit' level.
prediction_outputs = cby.predict_cats_from_sigs(
    df['ini'], df['cat_sig'], predict_level='Predict Digit')
df['pred_cat'], df['sig_sim'], y_info = prediction_outputs

# Summarise the predictions held in y_info.
confusion_outputs = cby.confusion_matrix_and_correct_series(y_info)
df['conf'], populations, ser_correct, fraction_correct = confusion_outputs

# Snapshot the real-label score under its own name; it is compared against
# the shuffled-label scores later in the script.
real_fraction_correct = deepcopy(fraction_correct)
print(real_fraction_correct)

# Cluster and render the signature matrix.
df['cat_sig'].shape
net.load_df(df['cat_sig'])
net.cluster()
net.widget()
# Colour used for each predicted-digit column category (cat_index=2), listed in
# the order the colours are registered. Each predicted digit gets the same
# colour name that the script assigns to the corresponding true digit.
predicted_digit_colors = (
    ('Predict Digit: Zero', 'yellow'),
    ('Predict Digit: One', 'red'),
    ('Predict Digit: Two', 'blue'),
    ('Predict Digit: Three', 'green'),
    ('Predict Digit: Four', 'black'),
    ('Predict Digit: Five', 'orange'),
    ('Predict Digit: Six', 'purple'),
    ('Predict Digit: Seven', 'grey'),
    ('Predict Digit: Eight', 'white'),
    ('Predict Digit: Nine', 'brown'),
)

net.load_df(df['pred_cat'])
for predicted_cat, predicted_color in predicted_digit_colors:
    net.set_cat_color(axis='col', cat_index=2, cat_name=predicted_cat,
                      inst_color=predicted_color)

# Keep only the rows listed in keep_genes before clustering.
# `.loc` replaces the original `.ix`, which was deprecated in pandas 0.20 and
# removed in pandas 1.0; `.loc` is the label-based selector.
# NOTE(review): assumes keep_genes holds row labels of df['pred_cat'] — confirm.
net.load_df(df['pred_cat'].loc[keep_genes])
net.cluster()
net.widget()
# net.load_df(df['sig_max'])
# net.cluster()
# net.widget()
# Cluster and render df['conf'] (first output of
# cby.confusion_matrix_and_correct_series).
net.load_df(df['conf'])
net.cluster()
net.widget()

# Bar chart of ser_correct, largest value first.
ranked_correct = ser_correct.sort_values(ascending=False)
ranked_correct.plot(kind='bar', figsize=(20,5), grid=True)
%%time
# Number of shuffled-label runs requested from the comparison helper.
num_shuffles = 100
perform_ser = cby.compare_performance_to_shuffled_labels(
    df['ini'], 'Digit', num_shuffles=num_shuffles, pval_cutoff=pval_cutoff)
print('mean: ', perform_ser.mean(), 'std: ', perform_ser.std())

# Fraction of entries in perform_ser that are strictly greater than the score
# obtained with the real labels.
beats_real = perform_ser > real_fraction_correct
real_performance = len(perform_ser[beats_real]) / num_shuffles
print('real labels perform in the top ' + str(real_performance*100) + '% of shuffled labels')
print('previously calc real performance: ', real_fraction_correct)
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import pdist
df['ini'].shape
# NOTE(review): bare attribute reference — this does not call load_df.
net.load_df
cols = df['ini'].columns.tolist()
df['ini'].head()

# Compute the pairwise cosine distance between columns (pdist works on rows, so
# transpose first) and convert it to a similarity with 1 - distance.
# NOTE: despite its name, dist_arr therefore holds similarities, not distances.
dist_arr = 1 - pdist(df['ini'].transpose(), metric='cosine')
dist_arr.shape
dist_arr[:100]

from itertools import combinations

# pdist returns a condensed vector ordered (0,1), (0,2), ..., (1,2), ...;
# combinations(range(n), 2) yields index pairs in that same order, so the two
# line up element by element.
sample_combos = list(combinations(range(df['ini'].shape[1]),2))
len(sample_combos)

# One label per pair: (joined names, first entry's category, second entry's
# category).
# NOTE(review): assumes every column label is a tuple whose element 0 is the
# sample name and element 1 its category — confirm against the input file.
sample_names = [
    (cols[first][0] + '_' + cols[second][0], cols[first][1], cols[second][1])
    for first, second in sample_combos
]
print(sample_names[0])
print(sample_names[1])
print(sample_names[2])

ser_dist = pd.Series(data=dist_arr, index=sample_names)
ser_dist[0:10]
# `.loc` replaces the original `.ix`, which was deprecated in pandas 0.20 and
# removed in pandas 1.0.
ser_dist.loc[sample_names[0:2]]
# Split the pair labels in a single pass: pairs whose two category entries are
# equal go to intra_cat, all others to inter_cat.
intra_cat = []
inter_cat = []
for pair_label in sample_names:
    if pair_label[1] == pair_label[2]:
        intra_cat.append(pair_label)
    else:
        inter_cat.append(pair_label)
print(len(intra_cat))
print(len(inter_cat))
# Total number of pairs across both groups.
print(len(intra_cat) + len(inter_cat))

# Values for each group, named so the columns of the combined frame are
# labelled.
ser_same = ser_dist[intra_cat].rename('Same Category')
ser_diff = ser_dist[inter_cat].rename('Different Category')
print(ser_same.shape, ser_diff.shape)
ser_same.mean()
ser_diff.mean()

# Two-column frame (one column per group) used for the box plot below.
df['sim_cats'] = pd.DataFrame([ser_same, ser_diff]).T
df['sim_cats'].shape

# Histograms of each group, first with default settings, then with explicit
# alpha values.
ser_same.hist()
ser_diff.hist()
ser_diff.hist(alpha=0.75)
ser_same.hist(alpha=1.0)
ax = df['sim_cats'].boxplot(return_type='axes')
from scipy.stats import ttest_ind, mannwhitneyu
# Two-sample t-tests between the groups: default settings first, then with
# equal_var=False.
ttest_ind(ser_diff, ser_same)
ttest_ind(ser_diff, ser_same, equal_var=False)
for group_ser in (ser_diff, ser_same):
    print(group_ser.shape)

# Mann-Whitney U between the two groups, then of ser_diff against itself.
mannwhitneyu(ser_diff, ser_same)
mannwhitneyu(ser_diff, ser_diff)

# Compare positional slices taken from within ser_diff.
diff_first = ser_diff[0:1000]
diff_second = ser_diff[1000:2000]
diff_fourth = ser_diff[3000:4000]
mannwhitneyu(diff_first, diff_second)
mannwhitneyu(diff_first, diff_fourth)
print(diff_first.mean())
print(diff_fourth.mean())

# NOTE(review): the second slice here spans 3000 entries while the first spans
# 1000 — confirm the unequal sizes are intended.
stat, pval = mannwhitneyu(ser_same[0:1000], ser_same[1000:4000])
pval